University of California, Santa Barbara


Load packages

library(tidyverse)
library(MplusAutomation)
library(here)
library(DiagrammeR)
library(glue)
library(cowplot)
library(gt)
library(Hmisc)

here::i_am("growth_mixtures.Rmd")

Example: Science Summary Scores


This example looks at science IRT scores over time (Grades 7-12). See the documentation here


Prepare Data

#library(haven)
#lsay <- read_sav(here("lsay.sav"))
#lsay_sci <- lsay %>%
#  select(CASENUM, ASCIIRT, CSCIIRT, ESCIIRT, GSCIIRT, ISCIIRT, KSCIIRT) 
#write_csv(lsay_sci, "lsay_sci.csv")

# Read the pre-extracted science scores and give the LSAY variables
# readable names (sciX = science IRT score in grade X, for grades 7-12).
lsay_sci <- read_csv(here("data", "lsay_sci.csv")) %>%
  rename(all_of(c(
    id    = "CASENUM",
    sci7  = "ASCIIRT",
    sci8  = "CSCIIRT",
    sci9  = "ESCIIRT",
    sci10 = "GSCIIRT",
    sci11 = "ISCIIRT",
    sci12 = "KSCIIRT"
  )))

Descriptive Statistics

# Univariate summaries (n, mean, sd, skew, ...) for each grade's score;
# the id column is excluded before describing.
psych::describe(dplyr::select(lsay_sci, -id))
##       vars    n  mean    sd median trimmed   mad   min   max range  skew
## sci7     1 3071 50.41 10.20  50.04   50.18 11.33 26.14 88.03 61.89  0.20
## sci8     2 2527 54.05 11.16  54.64   54.25 12.11 22.82 83.94 61.12 -0.17
## sci9     3 2326 58.69 11.24  60.40   59.15 11.05 27.36 91.21 63.85 -0.35
## sci10    4 4690 60.32 11.02  60.84   60.50 11.72 26.97 91.33 64.36 -0.14
## sci11    5 3592 64.10 11.21  64.75   64.51 11.10 24.44 93.13 68.69 -0.34
## sci12    6 2826 65.85 11.65  66.25   66.28 11.14 26.38 95.56 69.18 -0.35
##       kurtosis   se
## sci7     -0.46 0.18
## sci8     -0.65 0.22
## sci9     -0.40 0.23
## sci10    -0.43 0.16
## sci11    -0.17 0.19
## sci12     0.02 0.22

Correlation Table

# Pairwise correlations with pairwise-complete n and p-values (Hmisc::rcorr)
cor_data <- dplyr::select(lsay_sci, -id)

cor_data %>%
  as.matrix() %>%
  rcorr()
##       sci7 sci8 sci9 sci10 sci11 sci12
## sci7  1.00 0.81 0.76  0.75  0.74  0.74
## sci8  0.81 1.00 0.85  0.81  0.81  0.76
## sci9  0.76 0.85 1.00  0.88  0.86  0.82
## sci10 0.75 0.81 0.88  1.00  0.88  0.83
## sci11 0.74 0.81 0.86  0.88  1.00  0.90
## sci12 0.74 0.76 0.82  0.83  0.90  1.00
## 
## n
##       sci7 sci8 sci9 sci10 sci11 sci12
## sci7  3071 2494 2297  1958  1573  1134
## sci8  2494 2527 2067  1745  1436  1051
## sci9  2297 2067 2326  1792  1432  1043
## sci10 1958 1745 1792  4690  3323  2621
## sci11 1573 1436 1432  3323  3592  2433
## sci12 1134 1051 1043  2621  2433  2826
## 
## P
##       sci7 sci8 sci9 sci10 sci11 sci12
## sci7        0    0    0     0     0   
## sci8   0         0    0     0     0   
## sci9   0    0         0     0     0   
## sci10  0    0    0          0     0   
## sci11  0    0    0    0           0   
## sci12  0    0    0    0     0

Spaghetti Plot

# Subset to the first 500 respondents to keep the plot readable, drop rows
# with any missing score, and reshape to long format (one row per
# person-by-grade observation).
plot_data <- lsay_sci[1:500, ] %>%
  drop_na() %>%
  pivot_longer(cols = starts_with("sci"),
               names_to = "grade",
               values_to = "value") %>%
  # Order grades 7-12 explicitly (default alphabetical order would place
  # sci10-sci12 before sci7)
  mutate(grade = factor(grade,
                        levels = c("sci7", "sci8", "sci9", "sci10", "sci11", "sci12"),
                        labels = c(7, 8, 9, 10, 11, 12)))

# Grade-level means and standard errors for the overlaid mean trajectory
mean_sci <- plot_data %>%
  drop_na() %>%
  group_by(grade) %>%
  summarise(mean_response = mean(value),
            se_response = sd(value) / sqrt(n()))

# Individual trajectories (faint) with the mean trajectory and +/- 1 SE
# error bars (blue) layered on top.
ggplot() +
  geom_point(data = plot_data, aes(x = grade, y = value, group = id), alpha = .3) +
  geom_line(data = plot_data, aes(x = grade, y = value, group = id), alpha = .3) +
  geom_point(data = mean_sci, aes(x = grade, y = mean_response),
             color = "blue", size = 1.2) +
  geom_line(data = mean_sci, aes(x = grade, y = mean_response, group = 1),
            color = "blue", size = 1.2) +
  geom_errorbar(data = mean_sci,
                aes(x = grade,
                    ymin = mean_response - se_response,
                    ymax = mean_response + se_response),
                width = 0.2, size = 1.2, color = "blue") +
  # Fixed: the original had a stray double comma here (",,"), which passes
  # an empty argument to labs() and errors.
  labs(title = "Spaghetti Plot with Mean Line and Error Bars",
       x = "Grade",
       y = "Science Score") +
  theme_cowplot()


Growth Mixture Model

# Estimate growth mixture models with 1-6 latent classes.  Each iteration
# builds an Mplus input file via MplusAutomation and runs it, writing
# inputs/outputs into the gmm_enum/ folder.
gmm_6 <- lapply(1:6, function(k) {
  gmm_enum <- mplusObject(

    TITLE = glue("GMM {k}-Class"),

    VARIABLE = glue(
    "usevar = sci7-sci12; 
     classes = c({k}); "),

  # MLR estimator with 200/100 random starts; tech11/tech14 (requested
  # below) produce the VLMR-LRT and bootstrapped LRT used for enumeration.
  ANALYSIS = 
   "estimator = mlr; 
    type = mixture;
    starts = 200 100; 
    processors = 10;",

  # Linear growth model: intercept and slope with time scores 0-5
  # (grade 7 = 0, ..., grade 12 = 5)
  MODEL = 
    "%OVERALL%
       i s | sci7@0 sci8@1 sci9@2 sci10@3 sci11@4 sci12@5;",

  OUTPUT = "tech1 tech11 tech14 sampstat standardized;",

  SAVEDATA = 
    glue("FILE IS savedata_c{k}.dat;
     SAVE = cprobabilities;"),

  # Fixed: every Mplus statement must end with a semicolon; the original
  # 'series' line was missing one.
  PLOT = "type=plot3;
          series = sci7-sci12(*);",

  usevariables = colnames(lsay_sci),
  rdata = lsay_sci)

# dataout contains no {k} placeholder, so no glue() wrapper is needed there
gmm_enum_fit <- mplusModeler(gmm_enum, 
                            dataout = here("gmm_enum", "gmm_lsay.dat"),
                            modelout = glue(here("gmm_enum", "c{k}_gmm_lsay.inp")),
                            check = TRUE, run = TRUE, hashfilename = FALSE)
})

Table of Fit

First, extract data:

# Read every Mplus output file in gmm_enum/ whose filename contains "gmm"
output_gmm <- readModels(here("gmm_enum"), filefilter = "gmm", quiet = TRUE)

# Extract fit indices
enum_extract <- LatexSummaryTable(
  output_gmm,
  keepCols = c(
    "Title",
    "Parameters",
    "LL",
    "BIC",
    "aBIC",
    "BLRT_PValue",
    "T11_VLMR_PValue",
    "Observations"
  ),
  # NOTE(review): sortBy sorts Titles alphabetically; fine for 1-9 classes,
  # but "GMM 10-Class" would sort before "GMM 2-Class" -- verify if extended.
  sortBy = "Title"
)

# Extract the smallest model-estimated class proportion (as a percentage)
# for each model; the model name becomes the "Model" column via .id.
min_sizes <- map_df(output_gmm, ~ tibble(
  min_cs = round(min(.x$class_counts$modelEstimated$proportion) * 100, 2)
), .id = "Model")

# Combine dataframe
# NOTE(review): cbind() matches rows by position only, so this assumes
# enum_extract (sorted by Title) and min_sizes (readModels order) list the
# models in the same order -- confirm this holds if models are added.
combined <- cbind(enum_extract, min_sizes)

# Calculate additional fit indices
allFit <- combined %>%
  # CAIC: consistent Akaike information criterion
  mutate(CAIC = -2 * LL + Parameters * (log(Observations) + 1)) %>%
  # AWE: approximate weight of evidence criterion
  mutate(AWE = -2 * LL + 2 * Parameters * (log(Observations) + 1.5)) %>%
  # SIC (Schwarz information criterion) feeds the BF and cmPk approximations
  mutate(SIC = -.5 * BIC) %>%
  # Subtract max(SIC) before exponentiating to avoid numeric underflow
  mutate(expSIC = exp(SIC - max(SIC))) %>%
  # BF: approximate Bayes factor for model k vs. model k+1 (next row)
  mutate(BF = exp(SIC - lead(SIC))) %>%
  # cmPk: approximate correct model probability across the candidate set
  mutate(cmPk = expSIC / sum(expSIC)) %>%
  dplyr::select(Title, Parameters, min_cs, LL, BIC, aBIC, CAIC, AWE, BLRT_PValue, T11_VLMR_PValue, BF, cmPk) %>%
  arrange(Parameters)

Then, create table:

# Publication-style fit summary table.  Bold cells mark the best value of
# each information criterion and notable LRT / BF comparisons.
fit_table1 <- allFit %>%
  gt() %>%
  tab_header(title = md("**Model Fit Summary Table**")) %>%
  cols_label(
    Title = "Classes",
    Parameters = md("Par"),
    min_cs = md("Min. Class Size"),
    LL = md("*LL*"),
    T11_VLMR_PValue = "VLMR",
    BLRT_PValue = "BLRT",
    BF = md("BF"),
    cmPk = md("*cmPk*")
  ) %>%
  tab_footnote(
    footnote = md(
      "*Note.* Par = Parameters; *LL* = model log likelihood;
BIC = Bayesian information criterion;
aBIC = sample size adjusted BIC; CAIC = consistent Akaike information criterion;
AWE = approximate weight of evidence criterion;
BLRT = bootstrapped likelihood ratio test p-value;
VLMR = Vuong-Lo-Mendell-Rubin adjusted likelihood ratio test p-value;
*cmPk* = approximate correct model probability."
    ),
    locations = cells_title()
  ) %>%
  tab_options(column_labels.font.weight = "bold") %>%
  # Two decimals for min_cs, LL, and the information criteria
  fmt_number(c(3:8),
             decimals = 2) %>%
  # NOTE(review): fmt_missing() is superseded by sub_missing() in gt >= 0.6;
  # kept for compatibility with the gt version used here.
  fmt_missing(1:12,
              missing_text = "--") %>%
  # p-values and cmPk: print "<.001" below threshold, else two decimals
  fmt(
    c(9:10, 12),
    fns = function(x)
      ifelse(x < 0.001, "<.001",
             scales::number(x, accuracy = .01))
  ) %>%
  # Bayes factors above 100 are truncated to ">100"
  fmt(
    11,
    fns = function(x)
      ifelse(x > 100, ">100",
             scales::number(x, accuracy = .01))
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
      ),
    # Fixed: cells_body()'s argument is `rows` (plural); the original
    # `row =` only worked through R's partial argument matching.
    locations = list(cells_body(
     columns = BIC,
     rows = BIC == min(BIC[c(1:6)]) # Change this to the number of classes you estimated
    ),
    cells_body(
     columns = aBIC,
     rows = aBIC == min(aBIC[1:6])
    ),
    cells_body(
     columns = CAIC,
     rows = CAIC == min(CAIC[1:6])
    ),
    cells_body(
     columns = AWE,
     rows = AWE == min(AWE[1:6])
    ),
    cells_body(
     columns = cmPk,
     rows = cmPk == max(cmPk[1:6])
     ),
    cells_body(
     columns = BF,
     rows = BF > 10),
    # NOTE(review): these ifelse() selectors yield NA for non-matching rows
    # (not FALSE) -- appears to render as intended, but confirm with the
    # installed gt version.
    cells_body(
     columns = T11_VLMR_PValue,
     rows = ifelse(T11_VLMR_PValue < .001 & lead(T11_VLMR_PValue) > .05, T11_VLMR_PValue < .001, NA)),
    cells_body(
     columns = BLRT_PValue,
     rows = ifelse(BLRT_PValue < .001 & lead(BLRT_PValue) > .05, BLRT_PValue < .001, NA))
  )
)

fit_table1
Model Fit Summary Table1
Classes Par Min. Class Size LL BIC aBIC CAIC AWE BLRT VLMR BF cmPk
GMM 1-Class 11 100.00 −63,865.66 127,826.77 127,791.82 127,837.77 127,955.22 – – 0.00 <.001
GMM 2-Class 14 15.44 −63,782.56 127,686.59 127,642.10 127,700.59 127,850.06 <.001 <.001 0.00 <.001
GMM 3-Class 17 14.92 −63,732.16 127,611.83 127,557.81 127,628.83 127,810.34 <.001 <.001 0.00 <.001
GMM 4-Class 20 2.48 −63,698.52 127,570.57 127,507.01 127,590.57 127,804.11 <.001 <.001 0.18 <.001
GMM 5-Class 23 1.62 −63,683.78 127,567.14 127,494.05 127,590.14 127,835.71 <.001 0.11 0.00 <.001
GMM 6-Class 26 1.30 −63,663.16 127,551.92 127,469.30 127,577.92 127,855.52 <.001 0.00 – 1.00
1 Note. Par = Parameters; LL = model log likelihood; BIC = Bayesian information criterion; aBIC = sample size adjusted BIC; CAIC = consistent Akaike information criterion; AWE = approximate weight of evidence criterion; BLRT = bootstrapped likelihood ratio test p-value; VLMR = Vuong-Lo-Mendell-Rubin adjusted likelihood ratio test p-value; cmPk = approximate correct model probability.

Information Criteria Plot

# Elbow plot of the four information criteria across the class solutions.
allFit %>%
  dplyr::select(LL:AWE) %>%
  # rowid = number of classes (allFit rows are arranged by Parameters)
  rowid_to_column() %>%
  pivot_longer(`BIC`:`AWE`,
               names_to = "Index",
               values_to = "ic_value") %>%
  # Fix the legend order of the criteria
  mutate(Index = factor(Index,
                        levels = c ("AWE", "CAIC", "BIC", "aBIC"))) %>%
  ggplot(aes(
    x = rowid,
    y = ic_value,
    color = Index,
    shape = Index,
    group = Index,
    lty = Index
  )) +
  geom_point(size = 2.0) + geom_line(size = .8) +
  # One x-axis tick per estimated class solution
  scale_x_continuous(breaks = 1:nrow(allFit)) +
  scale_colour_grey(end = .5) +
  theme_cowplot() +
  labs(x = "Number of Classes", y = "Information Criteria Value", title = "Information Criteria") +
  theme(
    text = element_text(family = "Times", size = 12),
    legend.text = element_text(family="Times", size=12),
    legend.key.width = unit(3, "line"),
    legend.title = element_blank(),
    legend.position = "top"  
  )


Plot GMM

# Plot estimated class trajectories over the raw data for every model in
# output_gmm; time_scale maps grades 7-12 onto 1-6, and the near-zero
# alpha_range keeps the raw trajectories faint.
plotGrowthMixtures(output_gmm, estimated = TRUE, rawdata = TRUE, 
                   time_scale = c(1, 2, 3, 4, 5, 6), alpha_range = c(0, 0.01))